import os
train = os.listdir('train')
test=os.listdir('test')
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
train_cat = list(filter(lambda x:x[:3]=='cat' ,train))
train_dog = list(filter(lambda x:x[:3]=='dog', train))
x = ['cat number in train','dog number in train','test number']
y = [len(train_cat),len(train_dog),len(test)]
sns.barplot(x=x,y=y)
print('训练集总量:',len(train))
print('训练集猫的数量:',len(train_cat))
print('训练集狗的数量:',len(train_dog))
print('测试集总量:',len(test))
import random
import os
import numpy as np
import math
import json
from keras.preprocessing import image
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Read the file names of outlier (abnormal) images from a JSON file.
def read_data(filename):
    """Return the 'outliers' name list stored in the JSON file *filename*."""
    with open(filename, 'r') as fp:
        return json.load(fp)['outliers']
def display_img_list(data_dir, img_list, fig_w=5):
    """Show every existing image from *img_list* in a grid with fig_w columns.

    data_dir: directory holding the images; img_list: bare file names;
    missing files are skipped silently (their grid cell stays empty).
    """
    total = len(img_list)
    fig_h = math.ceil(total / fig_w)
    # 5 inches of height per row keeps thumbnails readable.
    plt.figure(figsize=(18, 5 * fig_h))
    for idx, name in enumerate(img_list):
        path = data_dir + os.sep + name
        if os.path.exists(path):
            plt.subplot(fig_h, fig_w, idx + 1)
            plt.gca().set_title(name)
            plt.imshow(image.load_img(path))
    plt.show()
# Load the full list of outlier training images recorded in
# total_outliers.json, report how many there are, and preview them in a grid.
total_outliers = read_data("total_outliers.json")
print('the total number of outliers:',len(total_outliers))
print(total_outliers)
display_img_list("train", total_outliers)
def remove_outliers(data_dir, img_list):
    """Delete every listed image from *data_dir*, skipping names that do not exist.

    Prints one line per file actually removed.
    """
    for name in img_list:
        path = data_dir + os.sep + name
        if os.path.exists(path):
            os.remove(path)
            print(path + '----removed')
# Remove the outlier images from train/ and verify the before/after counts:
# (count before) - (count after) should equal len(total_outliers) when every
# outlier file actually existed on disk.
pre_remove_len = len(os.listdir('train'))
remove_outliers('train', total_outliers)
print('train移除前数量:',pre_remove_len)
print('train移除后数量:',len(os.listdir('train')))
print('train移除图片数量:',len(total_outliers))
from sklearn.model_selection import train_test_split
import seaborn as sns
import os
import shutil
# Re-list the data after outlier removal and re-draw the class distribution.
train = os.listdir('train')
test = os.listdir('test')
train_cat = [name for name in train if name.startswith('cat')]
train_dog = [name for name in train if name.startswith('dog')]
bar_labels = ['cat number in train','dog number in train','test number']
bar_counts = [len(train_cat), len(train_dog), len(test)]
sns.barplot(x=bar_labels, y=bar_counts)
print('训练集总量:',len(train))
print('训练集猫的数量:',len(train_cat))
print('训练集狗的数量:',len(train_dog))
print('测试集总量:',len(test))
import os
from keras.preprocessing import image
from keras.applications.inception_v3 import preprocess_input
from keras.applications.inception_v3 import InceptionV3
from keras.applications import inception_v3
import numpy as np
from tqdm import tqdm
import random
# ---- Load and preprocess every training image into memory ----
train_filenames = os.listdir('train')
print('训练图片总量:',len(train_filenames))
random.shuffle(train_filenames)
print(train_filenames[:10])

n = len(train_filenames)
Image_size = 299  # InceptionV3's default input resolution

# float16 halves the RAM footprint of keeping the whole dataset in memory.
X = np.zeros((n, Image_size, Image_size, 3), dtype=np.float16)
Y = np.zeros((n, 1), dtype=np.float16)

for idx, fname in enumerate(tqdm(train_filenames)):
    pil_img = image.load_img('./train/' + fname, target_size=(Image_size, Image_size))
    arr = np.expand_dims(image.img_to_array(pil_img), axis=0)
    # InceptionV3-specific scaling (pixels mapped into [-1, 1]).
    X[idx] = preprocess_input(arr)
    # Label from the file-name prefix: dog -> 1, cat -> 0.
    Y[idx] = 1 if 'dog' in fname else 0

from sklearn.model_selection import train_test_split
# Fixed seed so the train/validation split is reproducible across runs.
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=42)
print(X_train.shape, Y_valid.shape)
print('success')
from keras.applications import inception_v3
from keras.models import Model
from keras.layers import Dense, Dropout, GlobalAveragePooling2D, Input, Lambda
from keras import optimizers
# Build InceptionV3 pre-trained on ImageNet, without its top classifier.
base_model = inception_v3.InceptionV3(
    input_tensor=Input((Image_size, Image_size, 3)),
    weights='imagenet',
    include_top=False,
)
# Global average pooling + dropout on top of the convolutional features.
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dropout(0.5)(x)
# Single sigmoid unit for binary classification (1 == dog upstream).
predictions = Dense(1, activation='sigmoid')(x)
model = Model(inputs=base_model.input, outputs=predictions)
print('模型层数',len(base_model.layers))
# Fine-tune only the top two inception blocks:
# freeze layers [0, 280), train everything from layer 280 on.
for layer_idx, layer in enumerate(base_model.layers):
    layer.trainable = layer_idx >= 280
# Small LR because we are fine-tuning pre-trained weights.
opt = optimizers.Adam(lr=1e-5, decay=1e-6)
model.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
# Visualize the network topology.
# NOTE: keras.utils.visualize_util was removed in Keras 1.0; the supported API
# is keras.utils.plot_model (requires pydot + graphviz to be installed).
from keras.utils import plot_model
plot_model(model, to_file='model_InceptionV3.png', show_shapes=True)
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

# Stop as soon as val_loss fails to improve for one epoch, and keep only the
# weights of the best epoch on disk.
early_stop = EarlyStopping(monitor='val_loss', patience=1, verbose=0, mode='min')
checkpoint = ModelCheckpoint(filepath='model_file002_InceptionV3.h5',
                             monitor='val_loss', verbose=0, save_best_only=True)
callbacks = [early_stop, checkpoint]
result = model.fit(X_train, Y_train,
                   batch_size=128, epochs=8,
                   validation_data=(X_valid, Y_valid),
                   callbacks=callbacks)
import matplotlib.pyplot as plt

def plot_result(result, model_name):
    """Plot training/validation accuracy and loss curves from a Keras History.

    result: Keras History object; result.history maps metric name -> per-epoch
            values (keys are 'acc'/'val_acc' in old Keras, 'accuracy'/
            'val_accuracy' in newer versions — both are handled here).
    model_name: label used in the plot titles.
    """
    history = result.history

    def _plot_metric(key, ylabel):
        # One figure: train curve, validation curve, title, labels, legend.
        plt.plot(history.get(key, []))
        plt.plot(history.get('val_' + key, []))
        plt.title('%s %s' % (model_name, ylabel))
        plt.ylabel(ylabel)
        plt.xlabel('epoch')
        plt.legend(['train', 'validate'], loc='upper left')
        plt.show()

    # summarize history for accuracy — tolerate both Keras key spellings
    acc_key = 'acc' if 'acc' in history else 'accuracy'
    _plot_metric(acc_key, 'accuracy')
    # summarize history for loss
    _plot_metric('loss', 'loss')

plot_result(result, 'InceptionV3')
import os
test_filenames = os.listdir('test')
print(len(test_filenames))

import random
# NOTE(review): the shuffle only randomizes processing order; image ids are
# recovered from the file names when the submission is written.
random.shuffle(test_filenames)
print(test_filenames[:10])

from keras.preprocessing import image
from keras.applications.inception_v3 import preprocess_input
import numpy as np
from tqdm import tqdm

n = len(test_filenames)
Image_size = 299
# float16 keeps the whole preprocessed test set in RAM at half precision.
X_test = np.zeros((n, Image_size, Image_size, 3), dtype=np.float16)
for idx, fname in enumerate(tqdm(test_filenames)):
    pil_img = image.load_img('./test/' + fname, target_size=(Image_size, Image_size))
    arr = np.expand_dims(image.img_to_array(pil_img), axis=0)
    # Same InceptionV3 scaling as used for the training images.
    X_test[idx] = preprocess_input(arr)
y_pred = model.predict(X_test, verbose=1)
import pandas as pd
# Clip predictions away from 0/1 to bound the log-loss penalty on any
# confidently-wrong answers.
y_pred = y_pred.clip(min=0.005, max=0.995)
df = pd.read_csv("sampleSubmission.csv")
for row, fname in enumerate(test_filenames):
    # '1234.jpg' -> 1234; submission rows are ordered by 1-based image id.
    img_id = int(fname[fname.rfind(os.sep) + 1:fname.rfind('.')])
    df.loc[img_id - 1, ['label']] = y_pred[row]
df.to_csv('predict002_InceptionV3.csv', index=None)
df.head(10)
import pandas as pd
# Second submission variant with a wider clip margin (0.01 / 0.99).
# NOTE(review): y_pred was already clipped to [0.005, 0.995] above, so this
# re-clip only tightens the extremes further.
y_pred = y_pred.clip(min=0.01, max=0.99)
df = pd.read_csv("sampleSubmission.csv")
for row, fname in enumerate(test_filenames):
    # '1234.jpg' -> 1234; submission rows are ordered by 1-based image id.
    img_id = int(fname[fname.rfind(os.sep) + 1:fname.rfind('.')])
    df.loc[img_id - 1, ['label']] = y_pred[row]
df.to_csv('predict003_InceptionV3.csv', index=None)
df.head(10)
import matplotlib.pyplot as plt
import random
import cv2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.figure(figsize=(12, 14))
for i in range(16):
plt.subplot(4, 4, i+1)
img = cv2.imread('test/%d.jpg' % random.randint(1, 12500))
plt.imshow(img)
img = cv2.resize(img, (299, 299))
x = img.copy()
# 用xception里面的preprocess_input来预处理处理图片
x.astype(np.float32)
x = (x - 255.0/2)/ 255.0
prediction = model.predict(np.expand_dims(x, axis=0))
prediction = prediction[0]
if prediction < 0.5:
plt.title('cat %.2f%%' % (100 - prediction*100))
else:
plt.title('dog %.2f%%' % (prediction*100))